#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.0c  - 2-11-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################

"""
Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools

Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites

Aufbau einer Datenbank mit einfacher deutscher Sprache

Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen

Tests am 29.10.2015:
https://github.com/rsennrich/clevertagger

"""

#https://docs.python.org/2/library/configparser.html
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

import random
import codecs
import re
import mod
import stopwords
import pprint
import pattern.de
from pattern.de import conjugate
from pattern.de import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE

from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternTagger
from textblob_de import TextBlobDE
import treetaggerwrapper

#cursorMysql = mod.mysql.cursor()

# Empty set; not referenced anywhere else in the visible part of this script.
noDoubleHash = set()
# Regex matching one sentence-ending punctuation mark: "?", "." or "!"
# (original note: meant for matches like ". WORD").  Not referenced anywhere
# else in the visible part of this script.
re_match = r"(\?|\.|\!)"

# # sent_tokenize_list = sent_tokenize(text)
# # Summarize the text first and then work on it
# tSumy       = mod.summarizeText(text)
# #tokens      = mod.nltk.word_tokenize(tSumy)
# tokens      = mod.nltk.sent_tokenize(tSumy,  language='german')
# tokensRaw   = mod.nltk.word_tokenize(text)
#cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;")
#cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word))
import re

# https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper
# https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py
# http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process
# result = cursorMysql.fetchall()

# Read the input file named by the first command-line argument.
if len(sys.argv) < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("usage: %s INPUTFILE" % sys.argv[0])
inputfile = sys.argv[1]

# Read the whole file into one string.  No encoding is passed to
# codecs.open(), so under Python 2 the content is read as a byte string.
# (Alternative that decodes while reading:
#  codecs.open(inputfile, "r", encoding='utf-8'))
# The context manager closes the handle, which the former one-liner
# `codecs.open(...).read()` never did.
with codecs.open(inputfile, "r") as infile:
    text = infile.read()

# TreeTagger wrapper configured for German; the install directory is hard-coded.
tagger = treetaggerwrapper.TreeTagger(
    TAGLANG='de',
    TAGDIR='/home/onetipp/software/treetagger/',
)
GermanStopwords = stopwords.getGermanStopwords()
# NOTE(review): GermanSTTLIgnoreTags is not used in the visible part of this
# script.
GermanSTTLIgnoreTags = stopwords.getSttsIgnoreTags()

# Split the text into sentences with the German sentence tokenizer.
tokens = mod.nltk.sent_tokenize(text, language='german')

# STTS part-of-speech tags whose tokens are re-conjugated further below:
# VAFIN = finite auxiliary verb, VVFIN = finite full verb.
# Conjugation library reference: http://www.clips.ua.ac.be/pages/pattern-de
list_conjugate = ["VAFIN", "VVFIN"]

# Output buffer: collects the words and markup fragments in document order.
ListFinal = list()

# Rewrite the text sentence by sentence.  Tokens whose POS tag is listed in
# list_conjugate are replaced by the form returned by
# conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE) and wrapped in coloured
# bold/italic markup; every other token is appended to ListFinal unchanged.
#
# Original author's example:
#   "Sie zeigt auf der Karte wo die Stadt Moskau ist."
#   new: "Sie zeigt auf der Karte wo die Stadt Moskau wäre."
# TODO: if the POS tag is NE and the next POS tag is VV/VVFIN etc., the verb
#       should stay as it is.
for sentence in tokens:
    if sentence is None:
        continue

    raw_tags = tagger.tag_text(mod.safe_unicode(sentence))
    tagged = treetaggerwrapper.make_tags(raw_tags)

    for ele in tagged:
        # make_tags() can yield entries with a single field (NotTag, e.g. for
        # SGML markup or replaced URLs).  They carry no POS tag, so indexing
        # ele[1] would raise IndexError; such entries are skipped.
        if not ele or len(ele) < 2:
            continue

        word = mod.safe_unicode(ele[0]).encode('utf-8')
        pos_tag = mod.safe_unicode(ele[1]).encode('utf-8')

        # NOTE(review): the original condition tested
        # `pos_tag not in GermanStopwords` twice.  It is collapsed to a single
        # test here (same behaviour); the second operand was presumably meant
        # to check another list -- confirm the intent before changing it.
        if pos_tag in list_conjugate and pos_tag not in GermanStopwords:
            conj_tmp = conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE)
            # conjugate() may return None when no form is found; in that case
            # the original word is kept instead of emitting a bogus value.
            if conj_tmp:
                conj = mod.safe_unicode(conj_tmp).encode('utf-8')
                ListFinal.append("<b style=\"color:#57B055;\"><i>")
                ListFinal.append(conj)
                ListFinal.append("</i></b>")
                continue

        ListFinal.append(word)

#https://pypi.python.org/pypi/languagedet

# file schreiben
#readabilityVar      = str(mod.textstat.flesch_reading_ease(text))


# Join all collected words and markup fragments with single spaces.
writeThis = " ".join(ListFinal)

# Write the result UTF-8 encoded to the fixed temporary output file.  The
# codecs writer does the encoding, so no separate encode() call is needed,
# and the context manager closes the file on exit.
with codecs.open("/tmp/onetipp_tmp.txt", 'wb+', encoding='utf-8') as f:
    f.write(writeThis)

# mod.mysql.commit()
# mod.mysql.close()
#
# mod.sphinx.commit()
# mod.sphinx.close()

# Terminate with a success status.  sys.exit() is used instead of the exit()
# helper, which is installed by the site module and is therefore missing
# when the interpreter runs with -S.
sys.exit(0)


"""
The Flesch Reading Ease formula

function name - flesch_reading_ease(text)

Returns the Flesch Reading Ease score. The following table is helpful to assess the ease of readability of a document.

90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing

"""